



# === Replication data for Bureaucratic Responsiveness in Humanitarian Aid, Appendix A === #
# === The data here is used to replicate Tables A1, A2, A5, and A6. === #
# === The replication data for the topic model, including that used to generate Tables A3 and A4 can be found in the topic model script === #

library(dplyr)
library(ggplot2)
library(fixest)
library(broom)
library(mice)
library(miceadds)
library(purrr)
library(tidyr)
library(stringr)
library(ggcorrplot)
library(stargazer)


#load and transform data
data <- read.csv(file = "maindata_fpa_replication.csv")

data$affected_ofda[data$affected_ofda == 0] <- NA

numeric_vars <- sapply(data, is.numeric)

for (var in names(data)[numeric_vars]) {
  temp <- data[[var]]
  
  temp[temp == 0] <- 0.1
  
  if (var == "lag_trade_bal") {
    data[[paste0("log_", var)]] <- sign(temp) * log(abs(temp))
  } else {
    temp[temp < 0 | is.na(temp)] <- NA
    data[[paste0("log_", var)]] <- log(temp)
  }
}

#create summary statistics table
sum_data <- data %>% select(-case, -year)

summary_table <- sum_data %>%
  summarise(across(everything(), 
                   list(
                     Mean = ~mean(.x, na.rm = TRUE),
                     SD = ~sd(.x, na.rm = TRUE),
                     Min = ~min(.x, na.rm = TRUE),
                     Max = ~max(.x, na.rm = TRUE),
                     Median = ~median(.x, na.rm = TRUE),
                     N = ~sum(!is.na(.x))
                   ), 
                   .names = "{.col}_{.fn}"))

summary_table_long <- summary_table %>%
  pivot_longer(cols = everything(),
               names_to = "name",
               values_to = "Value") %>%
  mutate(
    Statistic = str_extract(name, "[^_]+$"),             
    Variable = str_replace(name, "_[^_]+$", "")          
  ) %>%
  select(Variable, Statistic, Value) %>%
  pivot_wider(names_from = Statistic, values_from = Value)

variable_order <- c("adj_ofda", "med_words", "org_med_words", "med_h", "affected_ofda",
                   "lag_trade_bal", "lag_adj_gdp", "lag_pop", "ch_words", "troops",
                   "log_adj_ofda", "log_med_words", "log_org_med_words", "log_med_h",
                   "log_affected_ofda", "log_lag_trade_bal", "log_lag_adj_gdp", 
                   "log_lag_pop", "log_ch_words", "log_troops")

summary_table_long <- summary_table_long %>%
  mutate(Variable = factor(Variable, levels = variable_order)) %>%
  arrange(Variable)


# === Table A1. Summary Statistics for All Variables === #
print(summary_table_long)

#create correlation matrix
cor_data <- data %>% 
  select(log_adj_ofda, log_med_words, log_med_h,
         log_ch_words, log_affected_ofda, log_lag_trade_bal,
         log_lag_adj_gdp, log_lag_pop)

cor_matrix <- cor(cor_data, use = "pairwise.complete.obs")

print(cor_matrix)

# === Table A2. Correlation Matrix" === #
stargazer(cor_matrix, type = "text", title = "Correlation Matrix")


#run regressions
#center variables
vars_to_center <- c("log_med_h", "log_med_words", "log_ch_words", "log_affected_ofda", 
                    "log_lag_trade_bal", "log_lag_adj_gdp", "log_lag_pop")

data <- data %>%
  mutate(across(all_of(vars_to_center), ~ as.numeric(scale(. , center = TRUE, scale = FALSE)), .names = "centered_{.col}"))

data <- data %>% select(case, year, log_adj_ofda, log_med_h,
                        log_med_words, log_ch_words,
                        log_affected_ofda, log_lag_trade_bal, log_lag_adj_gdp,
                        log_lag_pop, centered_log_med_h, centered_log_med_words,
                        centered_log_ch_words, centered_log_affected_ofda,
                        centered_log_lag_trade_bal, centered_log_lag_adj_gdp,
                        centered_log_lag_pop)

tempData <- mice(data, m=5, maxit=50, meth='pmm', seed=500, print = FALSE)
datlist <- mids2datlist(tempData)

mod1 <- lapply(datlist, FUN = function(i){
  lm.cluster(i, formula =
               log_adj_ofda ~ centered_log_med_words + 
               centered_log_med_h +
               factor(year) +
               centered_log_med_h*centered_log_med_words, 
             cluster=i$case)
})
betas <- lapply(mod1, coef)
vars <- lapply(mod1, vcov)
sum1 <- summary(pool_mi(qhat = betas, u = vars), conf.int = TRUE)

mod2 <- lapply(datlist, FUN = function(i){
  lm.cluster(i, formula =
               log_adj_ofda ~ centered_log_med_words + 
               centered_log_med_h +
               centered_log_affected_ofda + 
               factor(year) +
               centered_log_med_h*centered_log_med_words, 
             cluster=i$case)
})
betas <- lapply(mod2, coef)
vars <- lapply(mod2, vcov)
sum2 <- summary(pool_mi(qhat = betas, u = vars), conf.int = TRUE)



mod3 <- lapply(datlist, FUN = function(i){
  lm.cluster(i, formula =
               log_adj_ofda ~ centered_log_med_words + 
               centered_log_med_h +
               centered_log_affected_ofda + 
               centered_log_lag_trade_bal +
               factor(year) +
               centered_log_med_h*centered_log_med_words, 
             cluster=i$case)
})
betas <- lapply(mod3, coef)
vars <- lapply(mod3, vcov)
sum3 <- summary(pool_mi(qhat = betas, u = vars), conf.int = TRUE)


mod4 <- lapply(datlist, FUN = function(i){
  lm.cluster(i, formula =
               log_adj_ofda ~ centered_log_med_words + 
               centered_log_med_h +
               centered_log_affected_ofda + 
               centered_log_lag_trade_bal + centered_log_lag_adj_gdp + 
               factor(year) +
               centered_log_med_h*centered_log_med_words, 
             cluster=i$case)
})
betas <- lapply(mod4, coef)
vars <- lapply(mod4, vcov)
sum4 <- summary(pool_mi(qhat = betas, u = vars), conf.int = TRUE)

mod5 <- lapply(datlist, FUN = function(i){
  lm.cluster(i, formula =
               log_adj_ofda ~ centered_log_med_words + 
               centered_log_med_h +
               centered_log_affected_ofda + 
               centered_log_lag_trade_bal + centered_log_lag_adj_gdp + 
               centered_log_lag_pop +
               factor(year) +
               centered_log_med_h*centered_log_med_words, 
             cluster=i$case)
})
betas <- lapply(mod5, coef)
vars <- lapply(mod5, vcov)
sum5 <- summary(pool_mi(qhat = betas, u = vars), conf.int = TRUE)

mod6 <- lapply(datlist, FUN = function(i){
  lm.cluster(i, formula =
               log_adj_ofda ~ centered_log_med_words + 
               centered_log_med_h +
               centered_log_affected_ofda + 
               centered_log_lag_trade_bal + centered_log_lag_adj_gdp + 
               centered_log_lag_pop +
               centered_log_ch_words +
               factor(year) +
               centered_log_med_h*centered_log_med_words, 
             cluster=i$case)
})
betas <- lapply(mod6, coef)
vars <- lapply(mod6, vcov)
sum6 <- summary(pool_mi(qhat = betas, u = vars), conf.int = TRUE)

#extract coefficients, se, p-values and compile into a table
process_summary <- function(sum_table, model_name) {
  df <- data.frame(
    term = rownames(sum_table),           
    results = sum_table[, "results"],    
    se = sum_table[, "se"],       
    p = sum_table[, "p"],         
    model = model_name                    
  ) %>%
    filter(!grepl("case|year", term)) %>%
    mutate(
      # Append significance markers to results
      results = case_when(
        p < 0.01 ~ paste0(results, "***"),
        p < 0.05 ~ paste0(results, "**"),
        p < 0.101 ~ paste0(results, "*"),
        TRUE ~ as.character(results)
      )
    ) %>%
    select(-p)  # Remove p-values column
  
  return(df)
}

summary_tables <- list(sum1, sum2, sum3, sum4, sum5, sum6)
model_names <- c("Model 1", "Model 2", "Model 3", "Model 4", "Model 5", "Model 6")
combined_data <- map2_df(summary_tables, model_names, process_summary)

formatted_table <- combined_data %>%
  pivot_wider(names_from = model, values_from = c(results, se))

# === Table A5. Estimated Coefficients of Aid Without Country Fixed Effects === #
print(formatted_table)


#calculate r-squared
calc_r_squared <- function(model_list, datlist, formula) {
  r2_values <- sapply(1:length(datlist), function(i) {
    model <- model_list[[i]]
    data <- datlist[[i]]
    
    # Ensure complete cases for all variables in the model
    complete_data <- model.frame(as.formula(formula), data = data, na.action = na.omit)
    
    # Extract response variable
    y <- complete_data$log_adj_ofda
    
    # Construct design matrix
    X <- model.matrix(as.formula(formula), data = complete_data)
    
    # Extract coefficients
    beta <- coef(model)
    
    # Ensure column names in X match the coefficient names in beta
    common_vars <- intersect(names(beta), colnames(X))
    X <- X[, common_vars, drop = FALSE]
    beta <- beta[common_vars]
    
    # Compute fitted values
    y_hat <- X %*% beta  
    
    # Compute R-squared
    ss_total <- sum((y - mean(y, na.rm = TRUE))^2)
    ss_residual <- sum((y - y_hat)^2)
    1 - (ss_residual / ss_total)
  })
  
  # Pooling R^2 using Rubin�fs Rules
  m <- length(r2_values)
  r_bar <- mean(r2_values)
  b_var <- var(r2_values)
  w_var <- mean((r2_values - r_bar)^2)
  total_var <- ((m + 1) / m) * b_var - (1 / m) * w_var
  
  lower_ci <- r_bar - 1.96 * sqrt(total_var)
  upper_ci <- r_bar + 1.96 * sqrt(total_var)
  
  return(list(R_squared = r_bar, CI = c(lower_ci, upper_ci)))
}


models <- list(mod1, mod2, mod3, mod4, mod5, mod6)
formulas <- list(
  log_adj_ofda ~ centered_log_med_words + 
    centered_log_med_h + factor(year) +
    centered_log_med_h * centered_log_med_words,
  
  log_adj_ofda ~ centered_log_med_words + 
    centered_log_med_h + centered_log_affected_ofda + 
    factor(year) +
    centered_log_med_h * centered_log_med_words,
  
  log_adj_ofda ~ centered_log_med_words + 
    centered_log_med_h + centered_log_affected_ofda + 
    centered_log_lag_trade_bal + 
    factor(year) +
    centered_log_med_h * centered_log_med_words,
  
  log_adj_ofda ~ centered_log_med_words + 
    centered_log_med_h + centered_log_affected_ofda + 
    centered_log_lag_trade_bal + centered_log_lag_adj_gdp + 
    factor(year) +
    centered_log_med_h * centered_log_med_words,
  
  log_adj_ofda ~ centered_log_med_words + 
    centered_log_med_h + centered_log_affected_ofda + 
    centered_log_lag_trade_bal + centered_log_lag_adj_gdp + 
    centered_log_lag_pop + factor(year) +
    centered_log_med_h * centered_log_med_words,
  
  log_adj_ofda ~ centered_log_med_words + 
    centered_log_med_h + centered_log_affected_ofda + 
    centered_log_lag_trade_bal + centered_log_lag_adj_gdp + 
    centered_log_lag_pop + centered_log_ch_words +
    factor(year) +
    centered_log_med_h * centered_log_med_words
)

r2_results <- lapply(seq_along(models), function(i) {
  cat("\nCalculating R2 for Model", i, "...\n")
  calc_r_squared(models[[i]], datlist, formulas[[i]])
})

# Convert results into a data frame for easy viewing
r2_df <- data.frame(
  Model = paste0("mod", 1:6),
  R_squared = sapply(r2_results, function(x) x$R_squared),
  CI_lower = sapply(r2_results, function(x) x$CI[1]),
  CI_upper = sapply(r2_results, function(x) x$CI[2])
)

# === Table A5. Estimated Coefficients of Aid Without Country Fixed Effects, R-squared Values === #
print(r2_df)



#run additional regressions excluding outliers
#filter outliers
data <- data[!grepl("Iraq|Afghan|Syria", data$case), ]

#run the multiple imputation regressions
tempData <- mice(data, m=5, maxit=50, meth='pmm', seed=500, print = FALSE)
datlist <- mids2datlist(tempData)

mod1 <- lapply(datlist, FUN = function(i){
  lm.cluster(i, formula =
               log_adj_ofda ~ centered_log_med_words + 
               centered_log_med_h +
               factor(year) +
               centered_log_med_h*centered_log_med_words, 
             cluster=i$case)
})
betas <- lapply(mod1, coef)
vars <- lapply(mod1, vcov)
sum1 <- summary(pool_mi(qhat = betas, u = vars), conf.int = TRUE)

mod2 <- lapply(datlist, FUN = function(i){
  lm.cluster(i, formula =
               log_adj_ofda ~ centered_log_med_words + 
               centered_log_med_h +
               centered_log_affected_ofda + 
               factor(year) +
               centered_log_med_h*centered_log_med_words, 
             cluster=i$case)
})
betas <- lapply(mod2, coef)
vars <- lapply(mod2, vcov)
sum2 <- summary(pool_mi(qhat = betas, u = vars), conf.int = TRUE)



mod3 <- lapply(datlist, FUN = function(i){
  lm.cluster(i, formula =
               log_adj_ofda ~ centered_log_med_words + 
               centered_log_med_h +
               centered_log_affected_ofda + 
               centered_log_lag_trade_bal +
               factor(year) +
               centered_log_med_h*centered_log_med_words, 
             cluster=i$case)
})
betas <- lapply(mod3, coef)
vars <- lapply(mod3, vcov)
sum3 <- summary(pool_mi(qhat = betas, u = vars), conf.int = TRUE)


mod4 <- lapply(datlist, FUN = function(i){
  lm.cluster(i, formula =
               log_adj_ofda ~ centered_log_med_words + 
               centered_log_med_h +
               centered_log_affected_ofda + 
               centered_log_lag_trade_bal + centered_log_lag_adj_gdp + 
               factor(year) +
               centered_log_med_h*centered_log_med_words, 
             cluster=i$case)
})
betas <- lapply(mod4, coef)
vars <- lapply(mod4, vcov)
sum4 <- summary(pool_mi(qhat = betas, u = vars), conf.int = TRUE)

mod5 <- lapply(datlist, FUN = function(i){
  lm.cluster(i, formula =
               log_adj_ofda ~ centered_log_med_words + 
               centered_log_med_h +
               centered_log_affected_ofda + 
               centered_log_lag_trade_bal + centered_log_lag_adj_gdp + 
               centered_log_lag_pop +
               factor(year) +
               centered_log_med_h*centered_log_med_words, 
             cluster=i$case)
})
betas <- lapply(mod5, coef)
vars <- lapply(mod5, vcov)
sum5 <- summary(pool_mi(qhat = betas, u = vars), conf.int = TRUE)

mod6 <- lapply(datlist, FUN = function(i){
  lm.cluster(i, formula =
               log_adj_ofda ~ centered_log_med_words + 
               centered_log_med_h +
               centered_log_affected_ofda + 
               centered_log_lag_trade_bal + centered_log_lag_adj_gdp + 
               centered_log_lag_pop +
               centered_log_ch_words +
               factor(year) +
               centered_log_med_h*centered_log_med_words, 
             cluster=i$case)
})
betas <- lapply(mod6, coef)
vars <- lapply(mod6, vcov)
sum6 <- summary(pool_mi(qhat = betas, u = vars), conf.int = TRUE)

#extract coefficients, se, p-values and compile into a table
process_summary <- function(sum_table, model_name) {
  df <- data.frame(
    term = rownames(sum_table),           
    results = sum_table[, "results"],    
    se = sum_table[, "se"],       
    p = sum_table[, "p"],         
    model = model_name                    
  ) %>%
    filter(!grepl("case|year", term)) %>%
    mutate(
      # Append significance markers to results
      results = case_when(
        p < 0.01 ~ paste0(results, "***"),
        p < 0.05 ~ paste0(results, "**"),
        p < 0.101 ~ paste0(results, "*"),
        TRUE ~ as.character(results)
      )
    ) %>%
    select(-p)  # Remove p-values column
  
  return(df)
}

summary_tables <- list(sum1, sum2, sum3, sum4, sum5, sum6)
model_names <- c("Model 1", "Model 2", "Model 3", "Model 4", "Model 5", "Model 6")
combined_data <- map2_df(summary_tables, model_names, process_summary)

formatted_table <- combined_data %>%
  pivot_wider(names_from = model, values_from = c(results, se))

# === Table A6. Estimated Coefficients of Aid Excluding Iraq, Syria, and Afghanistan Cases === #
print(formatted_table)


#calculate r-squared
calc_r_squared <- function(model_list, datlist, formula) {
  r2_values <- sapply(1:length(datlist), function(i) {
    model <- model_list[[i]]
    data <- datlist[[i]]
    
    # Ensure complete cases for all variables in the model
    complete_data <- model.frame(as.formula(formula), data = data, na.action = na.omit)
    
    # Extract response variable
    y <- complete_data$log_adj_ofda
    
    # Construct design matrix
    X <- model.matrix(as.formula(formula), data = complete_data)
    
    # Extract coefficients
    beta <- coef(model)
    
    # Ensure column names in X match the coefficient names in beta
    common_vars <- intersect(names(beta), colnames(X))
    X <- X[, common_vars, drop = FALSE]
    beta <- beta[common_vars]
    
    # Compute fitted values
    y_hat <- X %*% beta  
    
    # Compute R-squared
    ss_total <- sum((y - mean(y, na.rm = TRUE))^2)
    ss_residual <- sum((y - y_hat)^2)
    1 - (ss_residual / ss_total)
  })
  
  # Pooling R^2 using Rubin�fs Rules
  m <- length(r2_values)
  r_bar <- mean(r2_values)
  b_var <- var(r2_values)
  w_var <- mean((r2_values - r_bar)^2)
  total_var <- ((m + 1) / m) * b_var - (1 / m) * w_var
  
  lower_ci <- r_bar - 1.96 * sqrt(total_var)
  upper_ci <- r_bar + 1.96 * sqrt(total_var)
  
  return(list(R_squared = r_bar, CI = c(lower_ci, upper_ci)))
}


models <- list(mod1, mod2, mod3, mod4, mod5, mod6)
formulas <- list(
  log_adj_ofda ~ centered_log_med_words + 
    centered_log_med_h + factor(year) +
    centered_log_med_h * centered_log_med_words,
  
  log_adj_ofda ~ centered_log_med_words + 
    centered_log_med_h + centered_log_affected_ofda + 
    factor(year) +
    centered_log_med_h * centered_log_med_words,
  
  log_adj_ofda ~ centered_log_med_words + 
    centered_log_med_h + centered_log_affected_ofda + 
    centered_log_lag_trade_bal + 
    factor(year) +
    centered_log_med_h * centered_log_med_words,
  
  log_adj_ofda ~ centered_log_med_words + 
    centered_log_med_h + centered_log_affected_ofda + 
    centered_log_lag_trade_bal + centered_log_lag_adj_gdp + 
    factor(year) +
    centered_log_med_h * centered_log_med_words,
  
  log_adj_ofda ~ centered_log_med_words + 
    centered_log_med_h + centered_log_affected_ofda + 
    centered_log_lag_trade_bal + centered_log_lag_adj_gdp + 
    centered_log_lag_pop + factor(year) +
    centered_log_med_h * centered_log_med_words,
  
  log_adj_ofda ~ centered_log_med_words + 
    centered_log_med_h + centered_log_affected_ofda + 
    centered_log_lag_trade_bal + centered_log_lag_adj_gdp + 
    centered_log_lag_pop + centered_log_ch_words +
    factor(year) +
    centered_log_med_h * centered_log_med_words
)

r2_results <- lapply(seq_along(models), function(i) {
  cat("\nCalculating R2 for Model", i, "...\n")
  calc_r_squared(models[[i]], datlist, formulas[[i]])
})

# Convert results into a data frame for easy viewing
r2_df <- data.frame(
  Model = paste0("mod", 1:6),
  R_squared = sapply(r2_results, function(x) x$R_squared),
  CI_lower = sapply(r2_results, function(x) x$CI[1]),
  CI_upper = sapply(r2_results, function(x) x$CI[2])
)

# === Table A6. Estimated Coefficients of Aid Excluding Iraq, Syria, and Afghanistan Cases, R-Squared Values === #
print(r2_df)

sink()